{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Copyright (c) Microsoft Corporation. All rights reserved.\n",
    "\n",
    "Licensed under the MIT License"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Distributed Training For Extractive Summarization on CNN/DM Dataset\n",
    "\n",
    "## Summary\n",
    "This notebook demonstrates how to use Azure Machine Learning to run distributed training using Distributed Data Parallel in Pytorch for extractive summarization. For more detailed model related information, please see [extractive_summarization_cnndm_transformer.ipynb](extractive_summarization_cnndm_transformer.ipynb)\n",
    "\n",
    "## Prerequisites\n",
    "If you are using an Azure Machine Learning Notebook VM, you are all set. Otherwise, refer to the [Configuration Notebook](https://github.com/Azure/MachineLearningNotebooks/blob/master/configuration.ipynb) first if you haven't already to establish your connection to the AzureML Workspace. Prerequisites are:\n",
    "\n",
    "- Azure subscription\n",
    "- Azure Machine Learning Workspace\n",
    "- Azure Machine Learning SDK\n",
    "\n",
    "To run rouge evaluation, please refer to the section of compute_rouge_perl in [summarization_evaluation.ipynb](summarization_evaluation.ipynb). \n",
    "\n",
    "You can run this notebook on CPU-only machines."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Import Libraries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%load_ext autoreload"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%autoreload 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import sys\n",
    "from tempfile import TemporaryDirectory\n",
    "import torch\n",
    "\n",
    "import azureml.core\n",
    "from azureml.core import Experiment, Workspace, Run\n",
    "from azureml.core.compute import ComputeTarget, AmlCompute\n",
    "from azureml.core.compute_target import ComputeTargetException\n",
    "from azureml.train.dnn import PyTorch\n",
    "from azureml.train.dnn import Nccl\n",
    "from azureml.widgets import RunDetails\n",
    "\n",
    "nlp_path = os.path.abspath(\"../../\")\n",
    "if nlp_path not in sys.path:\n",
    "    sys.path.insert(0, nlp_path)\n",
    "from utils_nlp.azureml.azureml_utils import get_or_create_workspace\n",
    "from utils_nlp.dataset.cnndm import CNNDMSummarizationDataset\n",
    "from utils_nlp.eval import compute_rouge_python\n",
    "from utils_nlp.models.transformers.extractive_summarization import (\n",
    "    ExtractiveSummarizer,\n",
    "    ExtSumProcessedData,\n",
    "    ExtSumProcessor,\n",
    ")\n",
    "# Check core SDK version number\n",
    "print(\"SDK version:\", azureml.core.VERSION)\n",
    "\n",
    "import pprint"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Configuration "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# for Azure ML Workspacen\n",
    "SUBSRIPTION_ID = \"YOUR_SUBSCRIPTION_ID\"\n",
    "LOCATION = \"YOUR_RESOURCE_GROUP_NAME\"  # example \"eastus2\"\n",
    "RESOURCE_GROUP = \"YOUR_WORKSPACE_NAME\"  # modifiy to use your own\n",
    "WORKSPACE_NAME = \"YOUR_WORKSPACE_REGION\"  # modifiy to use your own\n",
    "\n",
    "# for creating Azure ML Compute Cluster\n",
    "AMLCOMPUTE_CLUSTER_NAME = \"bertsumext\"  # modifiy to use your own\n",
    "NODE_COUNT = 2\n",
    "VM_SIZE = \"STANDARD_NC6\"  # this should be the VM that's supported by Azure and Azure ML\n",
    "\n",
    "\n",
    "# for creating Azure ML Experiment\n",
    "EXPERIMENT_NAME = \"NLP-ExtSum\"  # modifiy to use your own\n",
    "\n",
    "\n",
    "# local folder to save the downloaded data\n",
    "LOCAL_DATA_FOLDER = (\n",
    "    \"./bertsumext_aml/data/\"\n",
    ")  # modify to use your own, the penultimate level folder should exist\n",
    "LOCAL_CACHE_DIR = (\n",
    "    \"./bertsumext_aml/cache/\"\n",
    ") \n",
    "# Training related parameter\n",
    "MODEL_NAME = \"distilbert-base-uncased\"  # limited choice\n",
    "ENCODER = \"transformer\"\n",
    "# folder in the workspace where the data is uploaded to\n",
    "TARGET_DATA_FOLDER = \"/bertsum_processed_data\"  # modify to use your own\n",
    "TARGET_OUTPUT_DIR = f\"output/{EXPERIMENT_NAME}/\"\n",
    "# cache dir in the workspace\n",
    "TARGET_CACHE_DIR = f\"cache/{EXPERIMENT_NAME}/\"\n",
    "\n",
    "TRAIN_FILE = \"train.pt\"\n",
    "TEST_FILE = \"test.pt\"\n",
    "# file name for saving the prediction\n",
    "SUMMARY_FILENAME = \"generated_summaries.txt\"\n",
    "# file name for saving the trained model\n",
    "MODEL_FILENAME = \"dist_extsum.pt\"\n",
    "\n",
    "\n",
    "# local path to download the output from the cluster\n",
    "LOCAL_OUTPUT_DIR = \"./bertsumext_aml/output\"  # modifiy to use your own, the penultimate level folder\n",
    "\n",
    "\n",
    "# local folder to store all the related files to be copied to the workspace\n",
    "PROJECT_FOLDER = \"./azureml_exp\"\n",
    "# conda environment name, the yaml file will be copied to the workspace\n",
    "CONDA_ENV_NAME = \"nlp_gpu\"\n",
    "\n",
    "##\n",
    "# The number of lines at the head of data file used for preprocessing. -1 means all the lines.\n",
    "TOP_N = 100\n",
    "QUICK_RUN = True\n",
    "if not QUICK_RUN:\n",
    "    TOP_N = -1"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Create an AML Workspace"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create the workspace using the specified parameters\n",
    "ws = get_or_create_workspace(\n",
    "    workspace_name=WORKSPACE_NAME,\n",
    "    subscription_id=SUBSRIPTION_ID,\n",
    "    resource_group=RESOURCE_GROUP,\n",
    "    workspace_region=LOCATION,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\n",
    "    \"Workspace name: \" + ws.name,\n",
    "    \"Azure region: \" + ws.location,\n",
    "    \"Subscription id: \" + ws.subscription_id,\n",
    "    \"Resource group: \" + ws.resource_group,\n",
    "    sep=\"\\n\",\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Create an AML GPU Compute Cluster"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "try:\n",
    "    gpu_compute_target = ComputeTarget(workspace=ws, name=AMLCOMPUTE_CLUSTER_NAME)\n",
    "    print(\"Found existing compute target.\")\n",
    "except ComputeTargetException:\n",
    "    print(\"Creating a new compute target...\")\n",
    "    compute_config = AmlCompute.provisioning_configuration(\n",
    "        vm_size=VM_SIZE, max_nodes=NODE_COUNT, \n",
    "        idle_seconds_before_scaledown=\"600\"\n",
    "    )\n",
    "\n",
    "    # create the cluster\n",
    "    gpu_compute_target = ComputeTarget.create(\n",
    "        ws, AMLCOMPUTE_CLUSTER_NAME, compute_config\n",
    "    )\n",
    "\n",
    "    gpu_compute_target.wait_for_completion(show_output=True)\n",
    "\n",
    "# use get_status() to get a detailed status for the current AmlCompute.\n",
    "print(gpu_compute_target.get_status().serialize())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Create an Experiment"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "experiment = Experiment(ws, name=EXPERIMENT_NAME)\n",
    "ds = ws.get_default_datastore()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Download and Preprocess Data\n",
    "Download the CNN/DM dataset, preprocess it for extractive summarization, and save the processed train and test sets locally."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_dataset, test_dataset = CNNDMSummarizationDataset(top_n=TOP_N, local_cache_path=LOCAL_DATA_FOLDER)\n",
    "processor = ExtSumProcessor(model_name=MODEL_NAME, cache_dir=LOCAL_CACHE_DIR)\n",
    "ext_sum_train = processor.preprocess(train_dataset, oracle_mode=\"greedy\")\n",
    "ext_sum_test = processor.preprocess(test_dataset, oracle_mode=\"greedy\")\n",
    "save_path = os.path.join(LOCAL_DATA_FOLDER, \"processed\")\n",
    "os.makedirs(save_path, exist_ok=True)\n",
    "torch.save(ext_sum_train, os.path.join(save_path, TRAIN_FILE))\n",
    "torch.save(ext_sum_test, os.path.join(save_path, TEST_FILE))"
   ]
  },
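  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As a quick sanity check before uploading, look at the size of the preprocessed splits. This is a minimal sketch that assumes the preprocessed outputs are list-like and support `len()`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# assumes the preprocessed outputs are list-like; with QUICK_RUN the sizes\n",
    "# should be close to TOP_N (minus any lines dropped during preprocessing)\n",
    "print(\"train examples:\", len(ext_sum_train))\n",
    "print(\"test examples:\", len(ext_sum_test))"
   ]
  },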
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ds.upload(src_dir=save_path, target_path=TARGET_DATA_FOLDER)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Prepare for the Experiment Run\n",
    "Prepare the local project folder which is mirror to the workspace for the experiment"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ENTRY_SCRIPT = \"extractive_summarization_cnndm_distributed_train.py\"\n",
    "os.makedirs(PROJECT_FOLDER, exist_ok=True)\n",
    "os.system(\"python ../../tools/generate_conda_file.py --gpu --name {}\".format(CONDA_ENV_NAME))\n",
    "os.system(\"cp ./nlp_gpu.yaml {}\".format(PROJECT_FOLDER))\n",
    "os.system(\"cp {} {}\".format(ENTRY_SCRIPT, PROJECT_FOLDER))\n",
    "os.system(\"cp -r ../../utils_nlp {}\".format(PROJECT_FOLDER))"
   ]
  },
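  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The entry script `extractive_summarization_cnndm_distributed_train.py` is what each node runs; its actual contents are in the repository. As a rough, hypothetical sketch (not the actual script), the distributed setup it performs with the parameters passed below looks something like this:\n",
    "\n",
    "```python\n",
    "# hypothetical sketch of the DDP setup inside the entry script\n",
    "import argparse\n",
    "import torch.distributed as dist\n",
    "\n",
    "parser = argparse.ArgumentParser()\n",
    "parser.add_argument(\"--dist_url\", type=str)   # $AZ_BATCHAI_PYTORCH_INIT_METHOD\n",
    "parser.add_argument(\"--rank\", type=int)       # $AZ_BATCHAI_TASK_INDEX\n",
    "parser.add_argument(\"--node_count\", type=int)\n",
    "args = parser.parse_args()\n",
    "\n",
    "# one process per node in this sketch; world_size would instead be\n",
    "# node_count * gpus_per_node if the script spawned one process per GPU\n",
    "dist.init_process_group(\n",
    "    backend=\"nccl\",\n",
    "    init_method=args.dist_url,\n",
    "    rank=args.rank,\n",
    "    world_size=args.node_count,\n",
    ")\n",
    "```\n",
    "\n",
    "AzureML expands `$AZ_BATCHAI_PYTORCH_INIT_METHOD` and `$AZ_BATCHAI_TASK_INDEX` at run time, so each node receives a shared rendezvous address and its own rank."
   ]
  },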
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Submit Run"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "os.makedirs(LOCAL_OUTPUT_DIR, exist_ok=True)\n",
    "os.makedirs(os.path.join(LOCAL_OUTPUT_DIR, EXPERIMENT_NAME), exist_ok=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "NcclConfig=Nccl()\n",
    "quick_run = \"true\" if QUICK_RUN else \"false\"\n",
    "estimator = PyTorch(source_directory=PROJECT_FOLDER,\n",
    "                    compute_target=gpu_compute_target,\n",
    "                    script_params={\n",
    "                        \"--dist_url\": \"$AZ_BATCHAI_PYTORCH_INIT_METHOD\",\n",
    "                        \"--rank\": \"$AZ_BATCHAI_TASK_INDEX\",\n",
    "                        \"--node_count\": NODE_COUNT,\n",
    "                        \"--data_dir\":ds.path(f'{TARGET_DATA_FOLDER}').as_mount(),\n",
    "                        \"--cache_dir\": ds.path(f'{TARGET_CACHE_DIR}').as_mount(),\n",
    "                        '--output_dir':ds.path(f'{TARGET_OUTPUT_DIR}').as_mount(),\n",
    "                        \"--quick_run\":  quick_run,\n",
    "                        \"--summary_filename\": f'{SUMMARY_FILENAME}',\n",
    "                        \"--model_filename\": f'{MODEL_FILENAME}',\n",
    "                        \"--model_name\": MODEL_NAME,\n",
    "                        \"--encoder\": ENCODER,\n",
    "                        \"--train_file\": TRAIN_FILE,\n",
    "                        \"--test_file\": TEST_FILE\n",
    "                    },\n",
    "                    entry_script= ENTRY_SCRIPT,\n",
    "                    node_count=NODE_COUNT,\n",
    "                    distributed_training=NcclConfig,\n",
    "                    conda_dependencies_file=f'{CONDA_ENV_NAME}.yaml',\n",
    "                    use_gpu=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "run = experiment.submit(estimator)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "RunDetails(run).show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\"\"\"\n",
    "If you stop the notebook and come back, \n",
    "you'll need to use the run_id in the output of the previous cell \n",
    "to get run details.\n",
    "\"\"\"\n",
    "# fetched_run = Run(experiment, \"NLP-ExtSum_1579816237_ea238f69\")\n",
    "# RunDetails(fetched_run).show()"
   ]
  },
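  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Before downloading the outputs in the next section, make sure the run has finished; `Run.wait_for_completion` blocks until it does."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# block until the remote run finishes so the outputs exist on the datastore\n",
    "run.wait_for_completion(show_output=True)"
   ]
  },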
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Download Generated Summaries "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# need to clear the local output dir as the ds.download won't download if the path exists \n",
    "os.system(\"rm -rf  {}/*\".format(LOCAL_OUTPUT_DIR))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ds.download(target_path=LOCAL_OUTPUT_DIR,\n",
    "                   prefix=f'{TARGET_OUTPUT_DIR}{SUMMARY_FILENAME}',\n",
    "                   show_progress=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# the script uses <q> as sentence separator so it can write the prediction into the files properly\n",
    "# here we need to replace <q> with \"\\n\" to prepare for evalation\n",
    "# removing the ending \"\\n\" is also a preparation step for evalution.\n",
    "prediction = []\n",
    "with open(os.path.join(LOCAL_OUTPUT_DIR, f'{TARGET_OUTPUT_DIR}{SUMMARY_FILENAME}'), \"r\") as filehandle:\n",
    "    for cnt, line in enumerate(filehandle):\n",
    "        prediction.append(line[0:-1].replace(\"<q>\", \"\\n\")) # remove the ending \"\\n\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "prediction[0]"
   ]
  },
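  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "It is worth confirming that we got one generated summary per test example, since the ROUGE computation below assumes the candidate and reference lists are aligned."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# the ROUGE computation below assumes predictions and references are aligned\n",
    "print(\"predictions:\", len(prediction), \"test examples:\", len(ext_sum_test))"
   ]
  },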
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Compare with gold summaries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "source = []\n",
    "temp_target = []\n",
    "for i in ext_sum_test:\n",
    "    source.append(i[\"src_txt\"]) \n",
    "    temp_target.append(\" \".join(j) for j in i['tgt']) \n",
    "target = ['\\n'.join(i) for i in list(temp_target)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "target[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "source[0]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Download and evaluation the trained model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## you can also download the saved model and run prediction if you are running the notebook on a gpu machine\n",
    "#\"\"\"\n",
    "ds.download(target_path=LOCAL_OUTPUT_DIR,\n",
    "               prefix=f'{TARGET_OUTPUT_DIR}{MODEL_FILENAME}',\n",
    "               show_progress=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "BATCH_SIZE = 32\n",
    "summarizer = ExtractiveSummarizer(processor, encoder=ENCODER, cache_dir=LOCAL_CACHE_DIR)\n",
    "summarizer.model.load_state_dict(\n",
    "    torch.load(os.path.join(LOCAL_OUTPUT_DIR, f'{TARGET_OUTPUT_DIR}{MODEL_FILENAME}'),\n",
    "               map_location=\"cpu\"))\n",
    "\n",
    "prediction = summarizer.predict(ext_sum_test, num_gpus=torch.cuda.device_count(), batch_size=BATCH_SIZE, sentence_separator = \"\\n\")\n",
    "#\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "prediction[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "rouge_scores = compute_rouge_python(cand=prediction, ref=target)\n",
    "pprint.pprint(rouge_scores)"
   ]
  },
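  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "If you have the Perl ROUGE toolkit set up (see the prerequisites above and [summarization_evaluation.ipynb](summarization_evaluation.ipynb)), you can compute the reference ROUGE-1.5.5 scores instead. A minimal sketch, assuming `compute_rouge_perl` takes the same candidate/reference arguments as `compute_rouge_python`:\n",
    "\n",
    "```python\n",
    "# assumes the Perl ROUGE-1.5.5 toolkit is installed and configured\n",
    "from utils_nlp.eval import compute_rouge_perl\n",
    "\n",
    "rouge_perl_scores = compute_rouge_perl(cand=prediction, ref=target)\n",
    "pprint.pprint(rouge_perl_scores)\n",
    "```"
   ]
  },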
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Cleanup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import shutil\n",
    "if os.path.exists(LOCAL_DATA_FOLDER):\n",
    "    shutil.rmtree(LOCAL_DATA_FOLDER, ignore_errors=True)\n",
    "if os.path.exists(LOCAL_OUTPUT_DIR):\n",
    "    shutil.rmtree(LOCAL_OUTPUT_DIR, ignore_errors=True)\n",
    "if os.path.exists(LOCAL_CACHE_DIR):\n",
    "    shutil.rmtree(LOCAL_CACHE_DIR, ignore_errors=True)\n",
    "if os.path.exists(PROJECT_FOLDER):\n",
    "    shutil.rmtree(PROJECT_FOLDER, ignore_errors=True)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python (nlp_gpu)",
   "language": "python",
   "name": "nlp_gpu"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
